v1-v5:Huge Stock Market Analysis

  • v3 改动:
    • train test split 放到最后,减少数据的损失(设置time step15时会损失一部分开头的数据)
    • 顺序:
      • time series to supervise
      • get y
      • scale X
      • train test split
      • train model
      • test model

  • v4改动:
    • n_in = 15,n_out =2

  • v5改变:
    • 只对于成交量进行归一化,特征添加一个:当日开盘价

  • v6改变:
    • 15天的6个特征+当日开盘价 共91个feature,做归一化
    • 第16天的Close做为target(y),不需要对其做归一化
      • 会有问题??!
      • 尝试先归一化 后去归一化
    • train test比例变高
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import os
path = 'C:/Users/Mengjie Ye/Desktop'
os.chdir(path)
import torch.nn as nn
torch.manual_seed(12)
  • 设置像素
In [ ]:
plt.rcParams['savefig.dpi'] = 300 #图片像素
plt.rcParams['figure.dpi'] = 200 #分辨率

Data Preprocessing

get data

In [ ]:
stock = pd.read_csv('./hsmd/a.us.txt')
print(stock.head())
  • 添加特征log-return
  • SettingWithCopyWarning
In [16]:
stock = new_data[4].copy()
stock = stock.iloc[2000:,:]
stock.head()
Out[16]:
Date Open High Low Close Volume OpenInt Label
2000 2013-02-07 83.30 83.39 79.74 82.34 515071 0 ssys
2001 2013-02-08 82.59 84.10 81.49 81.58 386168 0 ssys
2002 2013-02-11 82.00 82.46 80.16 80.66 334484 0 ssys
2003 2013-02-12 80.50 81.62 75.26 75.52 937083 0 ssys
2004 2013-02-13 76.27 77.57 73.52 75.81 1066981 0 ssys
In [17]:
re = np.log(stock.Close).diff(1)

stock['Return'] = re

stock.head()

stock = stock[['Open','High','Low','Close','Return','Volume']]
stock.dropna(inplace=True)#注意dropna和label的顺序 不然维数不一样
Label = stock.Close

# print(re)
print(stock.head(10))
# print(Label.shape)
print(stock.shape)
Out[17]:
Date Open High Low Close Volume OpenInt Label Return
2000 2013-02-07 83.30 83.39 79.74 82.34 515071 0 ssys NaN
2001 2013-02-08 82.59 84.10 81.49 81.58 386168 0 ssys -0.009273
2002 2013-02-11 82.00 82.46 80.16 80.66 334484 0 ssys -0.011341
2003 2013-02-12 80.50 81.62 75.26 75.52 937083 0 ssys -0.065845
2004 2013-02-13 76.27 77.57 73.52 75.81 1066981 0 ssys 0.003833
       Open   High    Low  Close    Return   Volume
2001  82.59  84.10  81.49  81.58 -0.009273   386168
2002  82.00  82.46  80.16  80.66 -0.011341   334484
2003  80.50  81.62  75.26  75.52 -0.065845   937083
2004  76.27  77.57  73.52  75.81  0.003833  1066981
2005  75.50  75.50  69.25  71.20 -0.062737  2254826
2006  71.00  71.77  68.00  68.62 -0.036909  1389022
2007  69.00  70.28  67.32  69.50  0.012743   835185
2008  69.50  70.70  68.05  68.31 -0.017271   840830
2009  67.91  67.91  64.74  66.15 -0.032131  1182314
2010  66.55  68.01  66.10  67.35  0.017978   596893
(1200, 6)
In [ ]:
print(stock.info())
In [ ]:
print(stock.isnull().sum())
In [ ]:
print(stock.describe())
  • 箱型图
In [ ]:
stock[['Open','High','Low','Close']].boxplot()
In [ ]:
stock[['Volume']].boxplot()
In [ ]:
stock[['Close']].hist(bins = 40)%save
In [ ]:
stock.corr(method = 'pearson')

convert time series to supervised prob

In [18]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
In [19]:
# stock_sv = series_to_supervised(stock,15,2)
stock_sv = series_to_supervised(stock,15,1)
# print(stock_sv.head())
# print(stock_sv.columns)
# Close = stock_sv['var4(t)'].values
# print(Close)

# Stock = stock_sv.drop(stock_sv.columns[[-1,-2,-4,-5,-6,-7,-8,-10,-11,-12]],axis=1)
# Stock = stock_sv.drop(stock_sv.columns[[-1,-2,-4,-5,-6]],axis=1)

#前15天的6个特征+当日的开盘价
Stock = stock_sv.drop(stock_sv.columns[[-1,-2,-4,-5]],axis=1)
print(Stock.head())
      var1(t-15)  var2(t-15)  var3(t-15)  var4(t-15)  var5(t-15)  var6(t-15)  \
2016       82.59       84.10       81.49       81.58   -0.009273    386168.0   
2017       82.00       82.46       80.16       80.66   -0.011341    334484.0   
2018       80.50       81.62       75.26       75.52   -0.065845    937083.0   
2019       76.27       77.57       73.52       75.81    0.003833   1066981.0   
2020       75.50       75.50       69.25       71.20   -0.062737   2254826.0   

      var1(t-14)  var2(t-14)  var3(t-14)  var4(t-14)  ...  var5(t-2)  \
2016       82.00       82.46       80.16       80.66  ...  -0.013066   
2017       80.50       81.62       75.26       75.52  ...   0.018058   
2018       76.27       77.57       73.52       75.81  ...   0.068557   
2019       75.50       75.50       69.25       71.20  ...  -0.009637   
2020       71.00       71.77       68.00       68.62  ...  -0.017315   

      var6(t-2)  var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  var5(t-1)  \
2016  1236443.0      61.98     64.800      61.62      64.26   0.018058   
2017   915161.0      70.00     72.690      67.53      68.82   0.068557   
2018  3756089.0      70.11     74.169      67.89      68.16  -0.009637   
2019  2446860.0      68.04     68.790      65.29      66.99  -0.017315   
2020  1307700.0      66.91     70.450      66.34      68.90   0.028113   

      var6(t-1)  var1(t)  var4(t)  
2016   915161.0    70.00    68.82  
2017  3756089.0    70.11    68.16  
2018  2446860.0    68.04    66.99  
2019  1307700.0    66.91    68.90  
2020  1087703.0    69.54    69.95  

[5 rows x 92 columns]

normalization (on features not target), train-test-split

  • 对91个特征进行归一化,需要对target进行
In [ ]:
# #对y也进行归一化,scaler_Close为了之后的去归一化
# Close_16 = Stock.values[:,-1]

# scaler = preprocessing.MinMaxScaler()
# scaler.fit(Stock)
# stock_scale = scaler.transform(Stock)
# print(stock_scale.shape)

# scaler_Close = preprocessing.MinMaxScaler()
# scaler_Close.fit(Close_16.reshape(-1,1))

# # stock_scale
# scaler_close
In [20]:
# 对于所有特征进行normalization,y不进行
X = Stock.values[:,:-1]
y = Stock.values[:,-1].reshape(-1,1)

# 对于所有特征进行normalization,y不进行
scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
print(X)
Out[20]:
MinMaxScaler(copy=True, feature_range=(0, 1))
[[0.55495005 0.55788439 0.55809111 ... 0.71157868 0.03123719 0.45099496]
 [0.55007844 0.54445718 0.54701424 ... 0.81374011 0.1542779  0.45190323]
 [0.53769301 0.53757983 0.50620471 ... 0.65555128 0.09757514 0.43481133]
 ...
 [0.05177112 0.05182577 0.05638378 ... 0.67029408 0.0304192  0.04780778]
 [0.05466105 0.04977894 0.05171983 ... 0.67028289 0.02161162 0.0464041 ]
 [0.05267938 0.06132307 0.05988174 ... 0.63555703 0.00858738 0.04400958]]
In [21]:
scaler_y = preprocessing.MinMaxScaler()
scaler_y.fit(y)
y = scaler_y.transform(y)
print(y)
Out[21]:
MinMaxScaler(copy=True, feature_range=(0, 1))
[[0.44200627]
 [0.43656162]
 [0.42690975]
 ...
 [0.0492493 ]
 [0.04586702]
 [0.04297971]]
In [ ]:
# # X = stock_scale[:,:-2]
# # y = stock_scale[:,-2:]
# X = stock_scale[:,:-1]
# y = stock_scale[:,-1:]
# print(X.shape,y.shape)
In [22]:
# 分割训练集和测试集
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,test_size=0.3,shuffle = False)


print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
(829, 91) (829, 1) (356, 91) (356, 1)

to LSTM 3D, to tensor

In [23]:
X_train = X_train.reshape((X_train.shape[0],1,X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0],1,X_test.shape[1]))
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

# y_train,y_test = y_train.reshape((-1,2)), y_test.reshape((-1,2))
y_train,y_test = y_train.reshape((-1,1)), y_test.reshape((-1,1))
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
(829, 1, 91) (356, 1, 91) (829, 1) (356, 1)
(829, 1, 91) (356, 1, 91) (829, 1) (356, 1)
  • to tensor
In [24]:
X_train_t = torch.from_numpy(X_train).type(torch.Tensor)
y_train_t= torch.from_numpy(y_train).type(torch.Tensor)

X_test_t = torch.from_numpy(X_test).type(torch.Tensor)
y_test_t = torch.from_numpy(y_test).type(torch.Tensor)

print(X_train_t)
print(X_train_t.shape)
tensor([[[0.5550, 0.5579, 0.5581,  ..., 0.7116, 0.0312, 0.4510]],

        [[0.5501, 0.5445, 0.5470,  ..., 0.8137, 0.1543, 0.4519]],

        [[0.5377, 0.5376, 0.5062,  ..., 0.6556, 0.0976, 0.4348]],

        ...,

        [[0.0365, 0.0355, 0.0440,  ..., 0.5940, 0.0417, 0.0451]],

        [[0.0390, 0.0404, 0.0465,  ..., 0.6894, 0.0325, 0.0467]],

        [[0.0473, 0.0548, 0.0547,  ..., 0.6997, 0.0213, 0.0515]]])
torch.Size([829, 1, 91])

Refining Model

In [29]:
EPOCHS = 500             
BATCH_SIZE = 20
TIME_STEP =  1         #上面已经手动把15天的feature搞在一起了
INPUT_SIZE =  91       # X_train.shape[2]
# INPUT_SIZE =  61
HIDDEN_SIZE = 30
LR = 0.01     # learning rate
# OUTPUT_SIZE = 2
OUTPUT_SIZE = 1
In [30]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()
        
        self.LSTM = nn.LSTM(
            input_size = INPUT_SIZE,
            hidden_size = HIDDEN_SIZE,
            num_layers = 2,
            batch_first = True
        )
        
#         self.out = nn.Linear(HIDDEN_SIZE,2)
        self.out = nn.Linear(HIDDEN_SIZE,OUTPUT_SIZE)
    
    def forward(self,x):
        # x.shape (batch,time_step,input_size)
        #r_out.shape (batch,time_step,output_size)
        #h_n.shape (n_layers,batch,hidden_size)
        #h_c.shape (n_layers,batch,hidden_size)
        r_out, (h_n,h_c) = self.LSTM(x,None)
        #None : zero initial hidden state
        res = self.out(r_out[:,-1,:])
        return res
    
Lstm = LSTM()
print(Lstm)
LSTM(
  (LSTM): LSTM(91, 30, num_layers=2, batch_first=True)
  (out): Linear(in_features=30, out_features=1, bias=True)
)
In [31]:
optimizer = torch.optim.Adam(Lstm.parameters(),lr=LR)
loss_func = nn.MSELoss()

Training Model

In [32]:
hist = np.zeros(EPOCHS)
# hist_inv = np.zeros(EPOCHS) #对y进行归一化后,记录去归一化之后的结果,没必要所以全部注释掉
hist_test = np.zeros(EPOCHS)
# hist_test_inv = np.zeros(EPOCHS)
for t in range(EPOCHS):
    y_pred = Lstm(X_train_t)
    y_pred_inv = torch.tensor(scaler_y.inverse_transform(y_pred.detach().numpy().reshape(-1,1)),requires_grad=True).double()
    y_train_inv = torch.tensor(scaler_y.inverse_transform(y_train.reshape(-1,1)),requires_grad=True).double()
    loss_inv = loss_func(y_pred_inv,y_train_inv)
    loss = loss_func(y_pred,y_train_t)
    if t % 20 == 0:
#         print('Epoch:',t,'loss',loss.item())
#         print('pred',y_pred.detach().numpy()[-3:])
#         print('train',y_train[-3:])

        print('Epoch:',t,'loss',loss_inv.item())
        print('pred',y_pred_inv[:5])
        print('train',y_train_inv[:5])
#     hist[t] = loss.item()
    hist[t] = loss_inv.item()
    
    optimizer.zero_grad()
    
    loss.backward()
#     loss_inv.backward()

    
    optimizer.step()
    
    test_pre = Lstm(X_test_t)
    y_test_pred_inv = torch.tensor(scaler_y.inverse_transform(test_pre.detach().numpy().reshape(-1,1)),requires_grad=True).double()
    y_test_inv = torch.tensor(scaler_y.inverse_transform(y_test.reshape(-1,1)),requires_grad=True).double()
    loss_test = loss_func(test_pre,y_test_t)
    loss_test_inv = loss_func(y_test_pred_inv,y_test_inv)
#     hist_test[t] = loss_test.item()
    hist_test[t]=loss_test_inv.item()
Epoch: 0 loss 3658.741490660089
pred tensor([[25.1196],
        [24.9521],
        [25.0627],
        [25.1789],
        [25.0127]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 20 loss 112.0325817233682
pred tensor([[80.3941],
        [77.0752],
        [77.2827],
        [75.3119],
        [73.1256]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 40 loss 44.21564929674138
pred tensor([[67.3784],
        [65.4188],
        [65.3419],
        [64.3173],
        [63.0231]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 60 loss 22.924677799144387
pred tensor([[69.6593],
        [67.7102],
        [67.6870],
        [66.8560],
        [65.7213]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 80 loss 17.520859710963453
pred tensor([[69.1512],
        [67.2207],
        [67.6076],
        [66.9356],
        [65.8931]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 100 loss 13.718591242446031
pred tensor([[67.9159],
        [66.1767],
        [66.8321],
        [66.3335],
        [65.4747]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 120 loss 11.549620307815417
pred tensor([[66.9009],
        [65.4549],
        [66.4389],
        [66.0469],
        [65.3306]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 140 loss 10.074963512211815
pred tensor([[66.5753],
        [65.5051],
        [66.7591],
        [66.3702],
        [65.7231]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 160 loss 9.144437951187424
pred tensor([[66.4823],
        [65.7942],
        [67.1700],
        [66.7188],
        [66.0910]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 180 loss 8.629586622551887
pred tensor([[66.1258],
        [65.7861],
        [67.3025],
        [66.7906],
        [66.1816]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 200 loss 7.862479042715384
pred tensor([[66.5321],
        [66.6127],
        [68.2300],
        [67.5835],
        [66.9969]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 220 loss 8.386688126477596
pred tensor([[67.1114],
        [67.4989],
        [69.0717],
        [68.2701],
        [67.6790]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 240 loss 7.269273354913462
pred tensor([[65.9821],
        [66.7215],
        [68.3015],
        [67.4087],
        [66.9035]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 260 loss 7.272694428336548
pred tensor([[65.7647],
        [66.8586],
        [68.4255],
        [67.3761],
        [66.9467]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 280 loss 6.832346094847361
pred tensor([[66.7335],
        [68.0218],
        [69.4643],
        [68.2573],
        [67.8490]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 300 loss 6.175377081353056
pred tensor([[66.2155],
        [67.8260],
        [69.2545],
        [67.9375],
        [67.6514]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 320 loss 6.1032280099963465
pred tensor([[66.5069],
        [68.4431],
        [69.8380],
        [68.3638],
        [68.1677]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 340 loss 6.98399888055507
pred tensor([[66.7337],
        [68.5597],
        [69.6940],
        [68.2090],
        [68.0193]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 360 loss 5.880924689575411
pred tensor([[66.2012],
        [68.3577],
        [69.5788],
        [68.0546],
        [68.0429]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 380 loss 5.609060542292516
pred tensor([[65.6461],
        [68.1447],
        [69.4208],
        [67.8236],
        [67.9517]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 400 loss 5.430833184691938
pred tensor([[65.8301],
        [68.6003],
        [69.8508],
        [68.1544],
        [68.3337]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 420 loss 5.442364210598432
pred tensor([[65.4333],
        [68.4727],
        [69.6781],
        [67.9046],
        [68.1460]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 440 loss 7.7609978059244185
pred tensor([[67.2096],
        [69.5021],
        [70.2682],
        [68.5115],
        [68.5203]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 460 loss 5.678962600416722
pred tensor([[65.6758],
        [68.1422],
        [68.9580],
        [67.2431],
        [67.4412]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
Epoch: 480 loss 5.340855219866963
pred tensor([[66.2387],
        [68.9559],
        [69.8279],
        [68.0387],
        [68.3300]], dtype=torch.float64, grad_fn=<SliceBackward>)
train tensor([[68.8200],
        [68.1600],
        [66.9900],
        [68.9000],
        [69.9500]], dtype=torch.float64, grad_fn=<SliceBackward>)
In [ ]:
# debug时的草稿

# _=torch.tensor(scaler_close.inverse_transform(y_pred.detach().numpy().reshape(-1,1)),requires_grad=True).double()
# __=torch.tensor(scaler_close.inverse_transform(y_train.reshape(-1,1)),requires_grad=True).double()
# print(_)
# print(__)
# loss_func(_,__)

# hist
In [ ]:
 
In [ ]:
#输出后两天

# y_pred_1 = y_pred_np[:,0]
# # y_pred_1.shape
# y_pred_2 = y_pred_np[:,1]
# # y_pred_2.shape
In [ ]:
#对y进行归一化所需要的

# scaler_Close = preprocessing.MinMaxScaler()
# Close_scaled=scaler_Close.fit_transform(Label.values.astype('float32').reshape(-1,1))
In [ ]:
# y_train.reshape(-1,1)
# y_train.shape
# y_pred_np.shape
# y_test.shape[0]+y_train.shape[0]
# y_train.shape

Evaluating

In [ ]:
# #对y归一化所需要的

# predict_close=scaler_Close.inverse_transform(y_pred_np.astype('float32').reshape(-1,1))
# real_close=scaler_Close.inverse_transform(y_train.astype('float32').reshape(-1,1))
# # predict_close_1=scaler_Close.inverse_transform(y_pred_1.astype('float32').reshape(-1,1))
# # predict_close_2=scaler_Close.inverse_transform(y_pred_2.astype('float32').reshape(-1,1))
# # real_close_1=scaler_Close.inverse_transform(y_train[:,0].astype('float32').reshape(-1,1))
# # real_close_2=scaler_Close.inverse_transform(y_train[:,1].astype('float32').reshape(-1,1))

# predict_close.shape
# # real_close_1.shape

plot train

In [33]:
y_pred_np = y_pred.detach().numpy()
# y_pred_np

y_pred_np = scaler_y.inverse_transform(y_pred.detach().numpy().reshape(-1,1))
y_train = scaler_y.inverse_transform(y_train.reshape(-1,1))

plotly

In [34]:
layout_train = plotly.graph_objs.Layout(
    title='train',
    autosize=False,
    width=800,
    height=600,
    xaxis=dict(
        title = "Date"
    ),
    yaxis=dict(
        title = "Close"
    )
    
)
In [40]:
# trace0 = y_train.tolist
# trace1 = y_pred_np.tolist
# trace = [trace0,trace1]
y_pred_np.shape
Out[40]:
(829, 1)
In [41]:
trace0 = plotly.graph_objs.Scatter(
#         x= ,
        y=y_train.reshape(y_train.shape[0]),
        mode='lines',
#         line=dict(
#             color = clr
#         )
        name='real'
    )
trace1 = plotly.graph_objs.Scatter(
#         x = ,
        y = y_pred_np.reshape(y_pred_np.shape[0]),
        mode = 'lines',
        name = 'pred'
    )
In [42]:
traces_train = [trace0,trace1]

fig_train = plotly.graph_objs.Figure(data=traces_train, layout=layout_train)


plotly.offline.iplot(fig_train)

In [ ]:
# train_df=pd.DataFrame({'predict':y_pred_np[900:].reshape(y_pred_np[900:].shape[0]),
#                        'real':y_train[900:].reshape(y_train[900:].shape[0])})
train_df=pd.DataFrame({'predict':y_pred_np.reshape(y_pred_np.shape[0]),
                       'real':y_train.reshape(y_train.shape[0])})
train_df['predict'].plot(label='predict',linewidth = 0.3)
train_df['real'].plot(label='real',linewidth = 0.3)
plt.legend()
plt.title('Closing Price Train ')
# plt.savefig('./hsmd/closing_train.png')



# train_df=pd.DataFrame({'predict_1':predict_close_1.reshape(predict_close_1.shape[0]),'predict_2':predict_close_2.reshape(predict_close_2.shape[0]),
#                        'real_1':real_close_1.reshape(real_close_1.shape[0]),'real_2':real_close_2.reshape(real_close_2.shape[0])})
# train_df['predict_1'].plot(label='predict_1',linewidth = 0.3)
# train_df['real_1'].plot(label='real_1',linewidth = 0.3)
# plt.legend()
# plt.title('Closing Price Train 1')
# plt.savefig('./hsmd/closing_train_1.png')
In [ ]:
# train_df['predict_2'].plot(label='predict_2',linewidth = 0.3)
# train_df['real_2'].plot(label='real_2',linewidth = 0.3)
# plt.legend()
# plt.title('Closing Price Train 2')
# plt.savefig('./hsmd/closing_train_2.png')

plot test

In [43]:
test_pred = Lstm(X_test_t).detach().numpy()
# predict_close_test=scaler_Close.inverse_transform(prediction.astype('float32'))
# real_close_test=scaler_Close.inverse_transform(y_test.astype('float32'))

# test_pred.shape

test_pred = scaler_y.inverse_transform(Lstm(X_test_t).detach().numpy().reshape(-1,1))
y_test = scaler_y.inverse_transform(y_test.reshape(-1,1))
In [44]:
layout_test = plotly.graph_objs.Layout(
    title='test',
    autosize=False,
    width=800,
    height=600,
    xaxis=dict(
        title = "Date"
    ),
    yaxis=dict(
        title = "Close"
    )
    
)
In [45]:
trace0_ = plotly.graph_objs.Scatter(
#         x= ,
        y=y_test.reshape(y_test.shape[0]),
        mode='lines',
#         line=dict(
#             color = clr
#         )
        name='real'
    )
trace1_ = plotly.graph_objs.Scatter(
#         x = ,
        y = test_pred.reshape(test_pred.shape[0]),
        mode = 'lines',
        name = 'pred'
    )
In [46]:
traces_test = [trace0_,trace1_]

fig_test = plotly.graph_objs.Figure(data=traces_test, layout=layout_test)


plotly.offline.iplot(fig_test)
In [ ]:
max((test_pred-y_test))
min((test_pred-y_test))
In [ ]:
test_df=pd.DataFrame({'predict':test_pred.reshape(test_pred.shape[0]),
                      'real':y_test.reshape(y_test.shape[0])})
test_df['predict'].plot(label='predict',linewidth = 0.3)
test_df['real'].plot(label='real',linewidth = 0.3)
plt.legend()
plt.title('Closing Price Test')
# plt.savefig('./hsmd/temp')

# test_df=pd.DataFrame({'predict_1':predict_close_test[:,0].reshape(predict_close_test[:,0].shape[0]),
#                       'predict_2':predict_close_test[:,1].reshape(predict_close_test[:,1].shape[0]),
#                       'real_1':real_close_test[:,0].reshape(real_close_test[:,0].shape[0]),
#                      'real_2':real_close_test[:,1].reshape(real_close_test[:,1].shape[0])})
# test_df['predict_1'].plot(label='predict_1',linewidth = 0.3)
# test_df['real_1'].plot(label='real_1',linewidth = 0.3)
# plt.legend()
# plt.title('Closing Price Test 1')
# plt.savefig('./hsmd/closing_test_1.png')
In [ ]:
# test_df['predict_2'].plot(label='predict_2',linewidth = 0.3)
# test_df['real_2'].plot(label='real_2',linewidth = 0.3)
# plt.legend()
# plt.title('Closing Price Test 2')
# plt.savefig('./hsmd/closing_test_2.png')

plot DATA

In [47]:
data_real = np.append(y_train,y_test)
data_pred = np.append(y_pred_np,test_pred)
In [48]:
layout_data = plotly.graph_objs.Layout(
    title='data',
    autosize=False,
    width=800,
    height=600,
    xaxis=dict(
        title = "Date"
    ),
    yaxis=dict(
        title = "Close"
    )
    
)

trace0_d = plotly.graph_objs.Scatter(
#         x= ,
        y=data_real.reshape(data_real.shape[0]),
        mode='lines',
#         line=dict(
#             color = clr
#         )
        name='real'
    )
trace1_d = plotly.graph_objs.Scatter(
#         x = ,
        y = data_pred.reshape(data_pred.shape[0]),
        mode = 'lines',
        name = 'pred'
    )
In [49]:
traces_d = [trace0_d,trace1_d]

fig_d = plotly.graph_objs.Figure(data=traces_d, layout=layout_data)


plotly.offline.iplot(fig_d)
In [ ]:
# data_df.max().max()
In [ ]:
data_df = pd.concat([train_df,test_df],axis = 0,ignore_index=True)
# print(data_df.head(),data_df.tail())
# 
data_df['predict'].plot(label='predict',linewidth = 0.3)
data_df['real'].plot(label='real',linewidth = 0.3)
plt.legend()
plt.title('Closing Price DATA ')
plt.vlines(train_df.shape[0],data_df.min().min(),data_df.max().max(),colors='r',linestyles='dashed',label='train test',linewidth=0.2)
# plt.savefig('./hsmd/closing_data.png')


# data_df['predict_1'].plot(label='predict_1',linewidth = 0.3)
# data_df['real_1'].plot(label='real_1',linewidth = 0.3)
# plt.legend()
# plt.title('Closing Price DATA 1 ')
# plt.vlines(train_df.shape[0],0,120,colors='r',linestyles='dashed',label='train test',linewidth=0.2)
# plt.savefig('./hsmd/closing_data_1.png')
In [ ]:
# data_df['predict_2'].plot(label='predict_2',linewidth = 0.3)
# data_df['real_2'].plot(label='real_2',linewidth = 0.3)
# plt.legend()
# plt.title('Closing Price DATA 2')
# plt.vlines(train_df.shape[0],0,120,colors='r',linestyles='dashed',label='train test',linewidth=0.2)
# plt.savefig('./hsmd/closing_data_2.png')
In [ ]:
# print(y_pred.detach().numpy())
# y_pred.detach().numpy()[1,1]
# y_train[1,1]
In [ ]:
# plt.figure()
# plt.plot(y_pred.detach().numpy()[:,0], label="Preds_1",linewidth = 2.0)
# plt.plot(y_train[:,0], label="Data_1")
# plt.legend()
# plt.show()

# plt.figure()
# plt.plot(y_pred.detach().numpy()[:,1], label="Preds_2",linewidth = 2.0)
# plt.plot(y_train[:,1], label="Data_2")
# plt.legend()
# plt.show()

plot loss

In [50]:
layout_loss = plotly.graph_objs.Layout(
    title='loss',
    autosize=False,
    width=800,
    height=600,
    xaxis=dict(
        title = "epoch"
    ),
    yaxis=dict(
        title = "loss"
    )
    
)

loss0 = plotly.graph_objs.Scatter(
#         x= ,
        y=hist,
        mode='lines',
#         line=dict(
#             color = clr
#         )
        name='Training loss'
    )
loss1 = plotly.graph_objs.Scatter(
#         x = ,
        y = hist_test,
        mode = 'lines',
        name = 'Testing loss'
    )
In [51]:
loss_trace = [loss0,loss1]
fig_loss = plotly.graph_objs.Figure(data=loss_trace, layout=layout_loss)


plotly.offline.iplot(fig_loss)
In [ ]:
plt.figure()
plt.plot(hist, label="Training loss",linewidth=0.3)
plt.plot(hist_test,label="Testing loss",linewidth = 0.3)
plt.legend()
# plt.savefig('./hsmd/loss1.png')
plt.show()
In [ ]:
#epoch 100之后的loss
plt.figure()
plt.plot(hist[100:], label="Training loss",linewidth=0.3)
plt.plot(hist_test[100:],label="Testing loss",linewidth = 0.3)
plt.legend()
# plt.savefig('./hsmd/loss1_100.png')
plt.show()

others

In [ ]:
hist[-1]
In [ ]:
hist.min()
In [ ]:
hist_test.min()
In [ ]:
hist_test[-1]
In [ ]:
prediction = Lstm(X_test_t)

loss = loss_func(prediction,y_test_t)

prediction

y_test_t

plt.figure()
plt.plot(prediction.detach().numpy(), label="Preds",linewidth = 2.0)
plt.plot(y_test, label="Data")
plt.legend()
plt.show()
In [ ]:
y.shape
In [ ]:
DATA = y[:,0].reshape((-1,1))
print(DATA.shape)
PRED = np.vstack([y_pred.detach().numpy(),prediction.detach().numpy()])
In [ ]:
plt.figure()
plt.plot(PRED, label="Preds",linewidth = 2.0)
plt.plot(DATA, label="Data")
plt.legend()
plt.show()

v6:Huge Stock Market Analysis

  • v6改变:
    • 对于所有价格取log进行平稳化

Data Preprocessing

get data,take log,add return

In [ ]:
stock = pd.read_csv('./hsmd/a.us.txt')
print(stock.head())
In [ ]:
stock[['Open','High','Low','Close']]= np.log(stock[['Open','High','Low','Close']])
In [ ]:
stock.head()
In [ ]:
stock['Return'] = stock.Close.diff(1)
stock = stock[['Open','High','Low','Close','Volume','Return']]
stock.dropna(inplace=True)#注意dropna和label的顺序 不然维数不一样
Label = stock.Close

# print(re)
print(stock.head())
# print(Label.shape)
print(stock.shape)
In [ ]:
# scaler = preprocessing.MinMaxScaler()

# V_values=stock.Volume.values.reshape((-1,1))

# V_values

# scaler.fit(V_values)

# Volume_=scaler.transform(V_values)

# Volume_

# stock['Volume']=Volume_
# print(stock.head())

convert time series to supervise, keep Open

In [ ]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
In [ ]:
stock_sv = series_to_supervised(stock,15,1)
# print(stock_sv.head())
# print(stock_sv.columns)
# Close = stock_sv['var4(t)'].values
# print(Close)

# Stock = stock_sv.drop(stock_sv.columns[[-1,-2,-4,-5,-6,-7,-8,-10,-11,-12]],axis=1)
Stock = stock_sv.drop(stock_sv.columns[[-1,-2,-4,-5]],axis=1)
print(Stock.head())

normalization

In [ ]:
# 对于所有特征进行normalization,y不进行
X = Stock.values[:,:-1]
y = Stock.values[:,-1]
In [ ]:
# 对于所有特征进行normalization,y不进行
scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
print(X)
In [ ]:
y = np.exp(y).reshape(-1,1)
print(X.shape,y.shape)
print(y)

scaler_y = preprocessing.MinMaxScaler()
scaler_y.fit(y)
y = scaler_y.transform(y)
print(y)

train-test-split

In [ ]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,shuffle = False)


print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

to LSTM 3D, to tensor

In [ ]:
X_train = X_train.reshape((X_train.shape[0],1,X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0],1,X_test.shape[1]))
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

# y_train,y_test = y_train.reshape((-1,2)), y_test.reshape((-1,2))
y_train,y_test = y_train.reshape((-1,1)), y_test.reshape((-1,1))
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
  • to tensor
In [ ]:
X_train_t = torch.from_numpy(X_train).type(torch.Tensor)
y_train_t= torch.from_numpy(y_train).type(torch.Tensor)

X_test_t = torch.from_numpy(X_test).type(torch.Tensor)
y_test_t = torch.from_numpy(y_test).type(torch.Tensor)

print(X_train_t)
print(X_train_t.shape)

Refining Model, input_size = 91

In [ ]:
EPOCHS = 500             
BATCH_SIZE = 20
TIME_STEP =  1         #上面已经手动把15天的feature搞在一起了
INPUT_SIZE =  91      # X_train.shape[2]
HIDDEN_SIZE = 30
LR = 0.01     # learning rate
# OUTPUT_SIZE = 2
OUTPUT_SIZE = 1
In [ ]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()
        
        self.LSTM = nn.LSTM(
            input_size = INPUT_SIZE,
            hidden_size = HIDDEN_SIZE,
            num_layers = 2,
            batch_first = True
        )
        
#         self.out = nn.Linear(HIDDEN_SIZE,2)
        self.out = nn.Linear(HIDDEN_SIZE,OUTPUT_SIZE)
    
    def forward(self,x):
        # x.shape (batch,time_step,input_size)
        #r_out.shape (batch,time_step,output_size)
        #h_n.shape (n_layers,batch,hidden_size)
        #h_c.shape (n_layers,batch,hidden_size)
        r_out, (h_n,h_c) = self.LSTM(x,None)
        #None : zero initial hidden state
        res = self.out(r_out[:,-1,:])
        return res
    
Lstm = LSTM()
print(Lstm)
In [ ]:
optimizer = torch.optim.Adam(Lstm.parameters(),lr=LR)
loss_func = nn.MSELoss()

Training Model

  • debug
In [ ]:
# H = np.zeros(EPOCHS)
# P = Lstm(X_train_t)

# print(P)
# y_train_t.max()

# loss_func(P,y_train_t).item()

# temp=torch.from_numpy(np.exp(Lstm(X_train_t).detach().numpy()))
  • training
In [ ]:
# hist = np.zeros(EPOCHS)
# hist_test = np.zeros(EPOCHS)
# # hist_exp = np.zeros(EPOCHS)
# # hist_exp_test = np.zeros(EPOCHS)
# for t in range(EPOCHS):
#     y_pred = Lstm(X_train_t)
    
#     loss = loss_func(y_pred,y_train_t)
# #     loss_exp = loss_func(torch.from_numpy(np.exp(y_pred.detach().numpy())),np.exp(y_train_t))
#     if t % 20 == 0:
#         print('Epoch:',t,'loss',loss.item())
#         print('pred',y_pred.detach().numpy()[-3:])
#         print('train',y_train[-3:])
#     hist[t] = loss.item()
    
#     optimizer.zero_grad()
    
#     loss.backward()
    
#     optimizer.step()
    
#     test_pre = Lstm(X_test_t)
#     loss_test = loss_func(test_pre,y_test_t)
# #     loss_exp_test = loss_func(torch.from_numpy(np.exp(test_pre.detach().numpy())),np.exp(y_test_t))
#     hist_test[t] = loss_test.item()
    
hist = np.zeros(EPOCHS)
# hist_inv = np.zeros(EPOCHS) #对y进行归一化后,记录去归一化之后的结果,没必要所以全部注释掉
hist_test = np.zeros(EPOCHS)
# hist_test_inv = np.zeros(EPOCHS)
for t in range(EPOCHS):
    y_pred = Lstm(X_train_t)
    y_pred_inv = torch.tensor(scaler_y.inverse_transform(y_pred.detach().numpy().reshape(-1,1)),requires_grad=True).double()
    y_train_inv = torch.tensor(scaler_y.inverse_transform(y_train.reshape(-1,1)),requires_grad=True).double()
    loss_inv = loss_func(y_pred_inv,y_train_inv)
    loss = loss_func(y_pred,y_train_t)
    if t % 20 == 0:
#         print('Epoch:',t,'loss',loss.item())
#         print('pred',y_pred.detach().numpy()[-3:])
#         print('train',y_train[-3:])

        print('Epoch:',t,'loss',loss_inv.item())
        print('pred',y_pred_inv[:5])
        print('train',y_train_inv[:5])
#     hist[t] = loss.item()
    hist[t] = loss_inv.item()
    
    optimizer.zero_grad()
    
    loss.backward()
#     loss_inv.backward()

    
    optimizer.step()
    
    test_pre = Lstm(X_test_t)
    y_test_pred_inv = torch.tensor(scaler_y.inverse_transform(test_pre.detach().numpy().reshape(-1,1)),requires_grad=True).double()
    y_test_inv = torch.tensor(scaler_y.inverse_transform(y_test.reshape(-1,1)),requires_grad=True).double()
    loss_test = loss_func(test_pre,y_test_t)
    loss_test_inv = loss_func(y_test_pred_inv,y_test_inv)
#     hist_test[t] = loss_test.item()
    hist_test[t]=loss_test_inv.item()

Evaluating

train loss

In [ ]:
plt.figure()
# plt.xlim(xmin=300,xmax=500)
# plt.ylim(ymax=400)
plt.plot(hist, label="Training loss",linewidth=0.3)
plt.plot(hist_test,label="Testing loss",linewidth = 0.3)
# plt.plot(hist_inv, label="Training loss",linewidth=0.3)
# plt.plot(hist_test_inv,label="Testing loss",linewidth = 0.3)
plt.legend()
# plt.savefig('./hsmd/loss1.png')
plt.show()

plot train

In [ ]:
y_pred_np = y_pred.detach().numpy()
y_pred_np
In [ ]:
y_pred_np = scaler_y.inverse_transform(y_pred.detach().numpy().reshape(-1,1))
y_train = scaler_y.inverse_transform(y_train.reshape(-1,1))
In [ ]:
train_df=pd.DataFrame({'predict':y_pred_np.reshape(y_pred_np.shape[0]),'real':y_train.reshape(y_train.shape[0])})
train_df['predict'].plot(label='predict',linewidth = 0.3)
train_df['real'].plot(label='real',linewidth = 0.3)
plt.legend()
plt.title('Closing Price Train ')
# plt.savefig('./hsmd/closing_train.png')

plot test

In [ ]:
prediction = Lstm(X_test_t).detach().numpy()
In [ ]:
test_pred = scaler_y.inverse_transform(Lstm(X_test_t).detach().numpy().reshape(-1,1))
y_test = scaler_y.inverse_transform(y_test.reshape(-1,1))
In [ ]:
test_df=pd.DataFrame({'predict':test_pred.reshape(test_pred.shape[0]),
                      'real':y_test.reshape(y_test.shape[0])})
test_df['predict'].plot(label='predict',linewidth = 0.3)
test_df['real'].plot(label='real',linewidth = 0.3)
plt.legend()
plt.title('Closing Price Test')

plot DATA

In [ ]:
data_df = pd.concat([train_df,test_df],axis = 0,ignore_index=True)
# print(data_df.head(),data_df.tail())
# 
data_df['predict'].plot(label='predict',linewidth = 0.3)
data_df['real'].plot(label='real',linewidth = 0.3)
plt.legend()
plt.title('Closing Price DATA ')
plt.vlines(train_df.shape[0],0,120,colors='r',linestyles='dashed',label='train test',linewidth=0.2)
# plt.savefig('./hsmd/closing_data.png')

Consider more than one company, choose 26x2(52) companies randomly

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import os
path = 'D:\jupyter pytorch\hsmd\dataset\Stocks'
os.chdir(path)
import torch.nn as nn
torch.manual_seed(12)
Out[1]:
<torch._C.Generator at 0x1f29ac9bc70>
In [2]:
import random


import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
# import plotly.plotly as py

import heapq

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [3]:
plt.rcParams['savefig.dpi'] = 400 #图片像素
plt.rcParams['figure.dpi'] = 400 #分辨率

Choose companies randomly from this huge dataset

get data, use random.sample() to choose 52 companies randomly from this huge dataset

In [4]:
#read data
# kernels let us navigate through the zipfile as if it were a directory

# trying to read a file of size zero will throw an error, so skip them
random.seed(12345)
filenames = [x for x in os.listdir() if x.endswith('.txt') and os.path.getsize(x) > 0]
filenames = random.sample(filenames,52)
print(filenames)
# filenames = ['intu.us.txt', 'spib.us.txt', 'acv.us.txt', 'veon.us.txt', 'vly-ws.us.txt', 'udbi.us.txt', 'flbr.us.txt', 'wsm.us.txt', 'hcm.us.txt', 'cts.us.txt']
['intu.us.txt', 'spib.us.txt', 'acv.us.txt', 'veon.us.txt', 'vly-ws.us.txt', 'udbi.us.txt', 'flbr.us.txt', 'wsm.us.txt', 'hcm.us.txt', 'cts.us.txt', 'esgr.us.txt', 'nrk.us.txt', 'jhs.us.txt', 'clns_c-cl.us.txt', 'hewi.us.txt', 'cae.us.txt', 'zyme.us.txt', 'jdiv.us.txt', 'ens.us.txt', 'nni.us.txt', 'pko.us.txt', 'cof_d.us.txt', 'pch.us.txt', 'nflt.us.txt', 'crvl.us.txt', 'gshtu.us.txt', 'sqlv.us.txt', 'spil.us.txt', 'bgr.us.txt', 'mtch.us.txt', 'spsb.us.txt', 'indf.us.txt', 'ocio.us.txt', 'mexx.us.txt', 'cmo.us.txt', 'cha.us.txt', 'cznc.us.txt', 'slg_i.us.txt', 'banfp.us.txt', 'csse.us.txt', 'wat.us.txt', 'gluu.us.txt', 'gabr.us.txt', 'ahc.us.txt', 'kof.us.txt', 'gldw.us.txt', 'ainv.us.txt', 'miii.us.txt', 'sa.us.txt', 'ssys.us.txt', 'insm.us.txt', 'abe.us.txt']
In [5]:
data = []
for filename in filenames:
    df = pd.read_csv(filename, sep=',')

    label, _, _ = filename.split(sep='.')
    df['Label'] = label
    df['Date'] = pd.to_datetime(df['Date'])
    data.append(df)
In [ ]:
# random.randint(0,255)
# dict
# r = lambda: random.randint(0,90)#画图的颜色
# str(r()) + str(r()) + str(r())
In [ ]:
# random.seed(1)
# print(random.randint(1,10))
# random.randint(1,10)
plotly.__version__
In [ ]:
temp = data[1].head(10)
In [ ]:
print(temp.head(10))
# temp.sort_values('Date')
# temp['Label'].iloc[1]

EDA on 52 companies' data

use plotly to plot these companies' Close in one figure

In [6]:
# r = lambda: random.randint(0,255)#画图的颜色
traces = []

for df in data:
#     clr = str(r()) + str(r()) + str(r())
#     df = df.sample(n=100, replace=True)
    df = df.sort_values('Date')
#     print(df['Label'])
    label = df['Label'].iloc[0]#取一个label, 0 1 2其实都一样

    trace = plotly.graph_objs.Scatter(
        x=df['Date'],
        y=df['Close'],
        mode='lines',
#         line=dict(
#             color = clr
#         )
        name=label
    )
    traces.append(trace)
    
In [7]:
layout = plotly.graph_objs.Layout(
    title='Plot',
    autosize=False,
    width=800,
    height=600,
    xaxis=dict(
        title = "Date"
    ),
    yaxis=dict(
        title = "Close"
    )
    
)
In [8]:
fig = plotly.graph_objs.Figure(data=traces, layout=layout)


plotly.offline.iplot(fig, filename='dataplot')
In [ ]:
# temp = data[0].sort_values('Date')
# label = temp['Label'].iloc[0]
# temp['Close'].std()

find companies with top 10 std of Close and create a new list containing these 10

In [9]:
stds = [] #用于储存每个company的Close std
labels = []#储存每个company的名字
for df in data:
    df = df.sort_values('Date')
    label = df['Label'].iloc[0]
    std = df['Close'].std()
    stds.append(std)
    labels.append(label)
In [10]:
#定义函数 返回前n大的位置和数值
def getMaxIndex(num_list,topk=1):
    max_num = heapq.nlargest(topk,num_list)
    max_num_index=list(map(num_list.index, max_num))
    return max_num_index,max_num
In [11]:
#找到std中前10大的公司
max_10_index,max_10_value=getMaxIndex(stds,10)
In [12]:
#将上面选中的公司 组合在一起 形成新的data
new_data = [data[i] for i in max_10_index]
In [13]:
[labels[i] for i in max_10_index]
Out[13]:
['wat', 'esgr', 'intu', 'kof', 'ssys', 'ens', 'wsm', 'crvl', 'nni', 'cha']
In [14]:
# new_data[0]
new_traces = [traces[i] for i in max_10_index]
In [15]:
new_fig = plotly.graph_objs.Figure(data=new_traces, layout=layout)

plotly.offline.iplot(new_fig,filename='D:\jupyter pytorch\hsmd\newdata.html')
# plotly.offline.save
# 

Use LSTM above to make prediction for these companies' Close

  • compare different performace for different companies
In [ ]:
new_data[0].head()
In [ ]:
stock = new_data[4].copy()
# stock = stock.iloc[2000:,:]
stock.head()
In [ ]: